YARN: Robust Multi-Tissue RNA-Seq Preprocessing and Normalization
Here is to do PCoA to check which tissues could be merged.
suppressPackageStartupMessages({
library(yarn)
library(stringr)
library(Biobase)
library(dplyr)
library(plotly)
library(doParallel)
library(edgeR)
})
eset <- readRDS('~/cvd/v8/exprs/gtex.v8.rds')
# sex misannotation
checkMisAnnotation(eset, "SEX", columnID = "chr", controlGenes="Y", legendPosition="topleft")
# PCoA for each tissues
checkTissuesToMerge(eset, "SMTS", "SMTSD")
Extract top 1000 features with high variances to calculate the distances.
# using euclidean is just PCA right...?
calcDistance <- function(mat, log=F, comp=1:2, nFeature=1000, distMethod='euclidean', sample=TRUE, ...) {
# modified from `plotCMDS()` in yarn R package
# identify top 1000 high variance genes
library(matrixStats)
if (!log) {
mat <- log2(mat + 1)
#mat <- cpm(mat, log = T, prior.count = 1)
# mat <- log2(mat)
# mat[which(!is.finite(mat))] <- 0 -> mat[which(is.na(mat))]
}
keep <- which(rowSums(mat) > 0)
vars <- rowSds(mat[keep,])
idx <- keep[order(vars, decreasing = TRUE)[seq_len(nFeature)]]
mat <- mat[idx,]
if (sample) mat <- t(mat)
# calculate distances
d <- dist(mat, method=distMethod)
ord <- as.data.frame(cmdscale(d, k=max(comp)))
ord
}
getColors <- function(n) {
# assgin a color to each group of samples
library(RColorBrewer)
#col_all <- brewer.pal(n, "Paired")
col <- brewer.pal.info[brewer.pal.info$category=='qual', ] # get max. 74 colours
col_all <- unlist(mapply(brewer.pal, col$maxcolors, rownames(col)))
ifelse (n > length(col_all),
cvec <- sample(col_all, n, replace=T),
cvec <- sample(col_all, n, replace=F)
)
cvec
}
sex.eset <- eset[which(fData(eset)$chr=='Y'),]
sex.dist <- calcDistance(exprs(sex.eset), comp = 1:3)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
sex.dist$SEX <- factor(pData(sex.eset)$SEX)
sex.dist$SEX <- ifelse(sex.dist$SEX==1, 'Male', 'Female')
cols <- c('#965F8A', '#4AC6B7')
plot_ly(sex.dist, x = ~V1, y = ~V2, z=~V3, color = ~SEX, colors = cols, size = 3) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'MDS1'),
yaxis = list(title = 'MDS2'),
zaxis = list(title = 'MDS3')))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# what is the third cluster???
trd <- sex.dist[sex.dist$V2 > 20,]
trdID <- rownames(trd)
trdPdata <- pData(eset)[trdID,]
unique(factor(trdPdata$SMTSD))
## [1] Testis
## Levels: Testis
# try without testis
sex.eset <- sex.eset[,which(!pData(sex.eset)$SMTSD %in% "Testis")]
sex.dist <- calcDistance(exprs(sex.eset), comp = 1:3)
sex.dist$SEX <- factor(pData(sex.eset)$SEX)
sex.dist$SEX <- ifelse(sex.dist$SEX==1, 'Male', 'Female')
cols <- c('#965F8A', '#4AC6B7')
plot_ly(sex.dist, x = ~V1, y = ~V2, z=~V3, color = ~SEX, colors = cols, size = 3) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'MDS1'),
yaxis = list(title = 'MDS2'),
zaxis = list(title = 'MDS3')))
brain.eset <- eset[,which(pData(eset)$SMTS=='Brain')]
brain.eset <- brain.eset[which(fData(brain.eset)$chr!='Y'),]
brain.dist <- calcDistance(exprs(brain.eset), comp = 1:3)
brain.dist$SMTSD <- factor(gsub('^Brain - ', '', pData(brain.eset)$SMTSD))
#cols <- getColors(length(unique(pData(brain.eset)$SMTSD)))
cols <- c("#386CB0", "#9f4c13", "#7de675", "#9894cc", "#deb791", "#2d9914", "#bfbfbf", "#fcceca", "#FB8072", "#666666", "#FFD92F", "#afeeee", "#FFFF6B")
plot_ly(brain.dist, x = ~V1, y = ~V2, z = ~V3, color = ~SMTSD, colors = cols, size = 2.5) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'MDS1'),
yaxis = list(title = 'MDS2'),
zaxis = list(title = 'MDS3')))
skin.eset <- eset[,which(pData(eset)$SMTS=='Skin')]
skin.eset <- skin.eset[which(fData(skin.eset)$chr!='Y'),]
skin.dist <- calcDistance(exprs(skin.eset), comp = 1:3)
skin.dist$SMTSD <- factor(pData(skin.eset)$SMTSD)
cols <- c('#00c1a3', '#1972A4', '#FF7070')
plot_ly(skin.dist, x = ~V1, y = ~V2, z = ~V3, color = ~SMTSD, colors = cols, size = 2.5) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'MDS1'),
yaxis = list(title = 'MDS2'),
zaxis = list(title = 'MDS3')))
The clusters of breast tissues are not due to Y genes, mostly from autosomal genes.
breast.eset <- eset[,which(pData(eset)$SMTS=='Breast')]
breast.eset <- breast.eset[which(fData(breast.eset)$chr!='Y'),]
breast.dist <- calcDistance(exprs(breast.eset), comp = 1:3)
breast.dist$SEX <- pData(breast.eset)$SEX
breast.dist$SEX <- ifelse(breast.dist$SEX==1, 'Male', 'Female')
cols <- cols <- c('#965F8A', '#4AC6B7')
plot_ly(breast.dist, x = ~V1, y = ~V2, z = ~V3, color = ~SEX, colors = cols, size = 3) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'MDS1'),
yaxis = list(title = 'MDS2'),
zaxis = list(title = 'MDS3')))